import datetime
import os
import pickle
from typing import Tuple

import gym
import numpy as np
from tqdm import tqdm
from absl import app, flags
from ml_collections import config_flags
from tensorboardX import SummaryWriter

import wrappers as w
from dataset_utils import D4RLDataset, reward_from_preference, reward_from_preference_transformer, split_into_trajectories
from evaluation import evaluate,original_evaluate
from learner import Learner
import copy

# Fraction of accelerator memory the XLA client is allowed to preallocate
# (JAX reads this variable when its backend is initialised).
os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '.40'

FLAGS = flags.FLAGS

# NOTE(review): other parts of this file read FLAGS.env, FLAGS.seed,
# FLAGS.batch_size, FLAGS.config and FLAGS.model_type, none of which are
# defined below -- presumably they are registered by another module; confirm.

# --- Environment / logging ---------------------------------------------------
flags.DEFINE_string(
    name='env_name',
    default='halfcheetah-expert-v2',
    help='Environment name.',
)
flags.DEFINE_string(
    name='save_dir',
    default='./runs/',
    help='Tensorboard logging dir.',
)

# --- Training / evaluation schedule -------------------------------------------
flags.DEFINE_integer(
    name='eval_episodes',
    default=10,
    help='Number of episodes used for evaluation.',
)
flags.DEFINE_integer(
    name='log_interval',
    default=5000,
    help='Logging interval.',
)
flags.DEFINE_integer(
    name='eval_interval',
    default=5000,
    help='Eval interval.',
)
flags.DEFINE_integer(
    name='max_steps',
    default=int(5e4),
    help='Number of training steps.',
)
flags.DEFINE_boolean(
    name='tqdm',
    default=True,
    help='Use tqdm progress bar.',
)

# --- Reward-model relabeling --------------------------------------------------
flags.DEFINE_boolean(
    name='use_reward_model',
    default=True,
    help='Use reward model for relabeling reward.',
)
flags.DEFINE_string(
    name='ckpt_dir',
    default='./logs/pref_reward',
    help='ckpt path for reward model.',
)
flags.DEFINE_integer(
    name='seq_len',
    default=50,
    help='sequence length for relabeling reward in Transformer.',
)
flags.DEFINE_bool(
    name='use_diff',
    default=False,
    help='boolean whether use difference in sequence for reward relabeling.',
)
flags.DEFINE_string(
    name='label_mode',
    default='last',
    help='mode for relabeling reward with tranformer.',
)


def normalize(dataset, env_name, max_episode_steps=1000):
    """Rescale ``dataset.rewards`` by the spread of trajectory returns.

    The dataset is split into trajectories and the undiscounted return of
    each one is computed. Every per-step reward is then divided by
    ``max_return - min_return`` and multiplied by ``max_episode_steps``.
    When ``env_name`` contains ``'antmaze'`` each reward is first shifted
    down by ``min_return / len(trajectory)`` for the trajectory it belongs to.

    Args:
        dataset: Object exposing ``observations``, ``actions``, ``rewards``,
            ``masks``, ``dones_float``, ``next_observations`` and ``size``.
            Its ``rewards`` attribute is replaced with a new array.
        env_name: Environment name; only checked for the substring
            ``'antmaze'``.
        max_episode_steps: Scale factor applied after dividing by the
            return range.

    Raises:
        ValueError: If the dataset yields no trajectories, or if every
            trajectory has the same return (the scale would be a division
            by zero and silently produce inf/NaN rewards).
    """
    trajs = split_into_trajectories(dataset.observations, dataset.actions,
                                    dataset.rewards, dataset.masks,
                                    dataset.dones_float,
                                    dataset.next_observations)
    if len(trajs) == 0:
        raise ValueError("Cannot normalize rewards: dataset contains no trajectories.")

    # Only the antmaze branch needs to know which trajectory a step came from.
    shift_by_min = 'antmaze' in env_name

    # Single pass: one return per trajectory and, if needed, the length of the
    # owning trajectory for every step (assumes steps are stored in trajectory
    # order, as the index-based lookup below relies on).
    traj_returns = []
    step_traj_lens = []
    for traj in tqdm(trajs, total=len(trajs), desc="chunk trajectories"):
        traj_returns.append(sum(rew for _, _, rew, _, _, _ in traj))
        if shift_by_min:
            step_traj_lens.extend([len(traj)] * len(traj))

    min_return, max_return = min(traj_returns), max(traj_returns)
    return_range = max_return - min_return
    if return_range == 0:
        raise ValueError(
            "Cannot normalize rewards: all trajectories have the same return "
            f"({max_return}), so the return range is zero.")

    normalized_rewards = []
    for i in range(dataset.size):
        reward = dataset.rewards[i]
        if shift_by_min:
            reward = reward - min_return / step_traj_lens[i]
        normalized_rewards.append(reward / return_range * max_episode_steps)

    dataset.rewards = np.array(normalized_rewards)


def make_env_and_dataset(env_name: str,
                         seed: int, reward_model):
    """Build the MetaWorld env and a reward-relabeled, normalized dataset.

    Args:
        env_name: Environment identifier. The second ``'_'``-separated field
            is used as the MetaWorld task name and as the data sub-directory;
            the full name is forwarded to the relabeling and normalization
            helpers.
        seed: Currently unused. The benchmark is always constructed with the
            fixed seed 1337, the same value the module-level environment uses.
        reward_model: Reward model handed to
            ``reward_from_preference_transformer`` to relabel the rewards.

    Returns:
        A tuple ``(env, dataset, relabeled_dataset)``: the time-limited
        environment, the relabeled dataset after ``normalize`` has been
        applied, and a deep copy of the relabeled dataset taken before
        normalization.

    Raises:
        IndexError: If ``env_name`` contains no ``'_'`` separator.
    """
    import metaworld
    from gym import wrappers

    dataset_name = env_name.split('_')[1]
    ml1 = metaworld.MT1(dataset_name, seed=1337)  # Construct the benchmark, sampling tasks

    env = ml1.train_classes[dataset_name]()  # Create an environment with task
    env = wrappers.TimeLimit(env, 500)
    env.train_tasks = ml1.train_tasks
    task = ml1.train_tasks[0]
    env.set_task(task)
    env._freeze_rand_vec = False

    # NOTE(security): allow_pickle=True unpickles arbitrary Python objects;
    # only point this at trusted data files.
    dataset_tmp = np.load(
        '/mnt/data/' + dataset_name + '/data_randgoal_08_50_08_batch.npy', allow_pickle=True).tolist()

    dataset = D4RLDataset(env, input_dataset=dataset_tmp)

    # ----------------------------- predict reward -------------------------------
    print('\n', 'model type: ', FLAGS.model_type, '\n')
    # Use the env_name argument (not the global flag) so the relabeling and
    # normalization always describe the same environment that was just built.
    dataset = reward_from_preference_transformer(
        env_name,
        dataset,
        reward_model,
        batch_size=FLAGS.batch_size,
        seq_len=FLAGS.seq_len,
        use_diff=FLAGS.use_diff,
        label_mode=FLAGS.label_mode
    )

    # Snapshot the relabeled rewards before normalize() replaces them.
    relabeled_dataset = copy.deepcopy(dataset)

    normalize(dataset, env_name, max_episode_steps=env._max_episode_steps)
    return env, dataset, relabeled_dataset

def initialize_model(env_name):
    """Return the reward model stored in the saved checkpoint for ``env_name``.

    The checkpoint is read from ``./saved_model/best_model_<env_name>.pkl``
    (relative to the current working directory) and must be a mapping with a
    ``'reward_model'`` entry.
    """
    checkpoint_file = f"./saved_model/best_model_{env_name}.pkl"
    with open(checkpoint_file, "rb") as handle:
        checkpoint = pickle.load(handle)
    print('reward model loaded...')
    return checkpoint['reward_model']

# Module-level setup: these statements run once, when this file is executed or
# imported. They build a MetaWorld environment and the shared `agent` that
# train_q_network() below updates.
# NOTE(review): FLAGS.env, FLAGS.config and FLAGS.seed are read here at module
# scope, but this file defines no `env`, `config` or `seed` flag -- presumably
# they are registered and parsed elsewhere before this code runs; confirm.
import metaworld
from gym import wrappers
# Number of agent updates; also forwarded to the Learner. Hard-coded here
# rather than read from the `max_steps` flag (whose default is the same value).
max_steps = 50000
env_name = FLAGS.env
# The second '_'-separated field of the env name selects the MT1 task/class.
dataset_name = env_name.split('_')[1]
ml1 = metaworld.MT1(dataset_name, seed=1337)  # Construct the benchmark, sampling tasks
env = ml1.train_classes[dataset_name]()  # Create an environment with task
# Wrap with gym's TimeLimit using 500 as the step limit argument.
env = wrappers.TimeLimit(env, 500)
# Attach the task list to the wrapped env and activate the first task.
env.train_tasks = ml1.train_tasks
task = ml1.train_tasks[0]
env.set_task(task)
# NOTE(review): presumably lets the env re-sample its random vector on reset
# instead of keeping it fixed -- confirm against the MetaWorld version in use.
env._freeze_rand_vec = False
# Copy the config into a plain dict so it can be expanded as Learner kwargs.
kwargs = dict(FLAGS.config)
# Sampled observation/action get a leading axis of length 1 via np.newaxis;
# presumably these serve as example inputs for the Learner -- confirm.
agent = Learner(FLAGS.seed,
                env.observation_space.sample()[np.newaxis],
                env.action_space.sample()[np.newaxis],
                max_steps=max_steps,
                **kwargs)

def train_q_network(reward_model, query_index):
    """Update the module-level ``agent`` on data relabeled by ``reward_model``.

    Builds the dataset with ``make_env_and_dataset`` and then performs
    ``max_steps`` (module-level constant) calls to ``agent.update``, each on
    a batch of ``FLAGS.batch_size`` samples drawn from the normalized
    dataset. The same module-level ``agent`` object is updated on every call
    to this function; it is not re-created here.

    Args:
        reward_model: Reward model forwarded to ``make_env_and_dataset``.
        query_index: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(agent, relabeled_dataset)``: the updated module-level agent
        and the relabeled (pre-normalization) dataset copy returned by
        ``make_env_and_dataset``.
    """
    # The freshly built env is not needed here: the agent was already
    # constructed from the module-level environment.
    _env, dataset, relabeled_dataset = make_env_and_dataset(FLAGS.env, FLAGS.seed, reward_model)
    print('env name: ', FLAGS.env)
    for _ in tqdm(range(1, max_steps + 1), smoothing=0.1, disable=not FLAGS.tqdm):
        agent.update(dataset.sample(FLAGS.batch_size))
    return agent, relabeled_dataset